# Report setup: chunk options, package attachment, plot theme, and inputs.
# Set the knitr chunk option `message = FALSE` as the default for all chunks.
knitr::opts_chunk$set(message = FALSE)
# Packages are attached in this order; later packages mask same-named
# functions from earlier ones, so the order is left untouched.
library(bslib)
library(dplyr)
library(ggplot2)
library(glue)
library(here)
library(lubridate)
library(plotly)
library(purrr)
library(readr)
library(rlang)
library(stringr)
library(tidyr)
# Use theme_bw() as the default ggplot2 theme for every plot below.
theme_set(theme_bw())
# Directory that holds the dated report sub-directories. `params` is not
# defined in this file -- presumably the R Markdown document parameters
# (TODO confirm); the trailing comment shows an alternative value.
input_dir <- params$input_dir # here("data")
# File names matching any of these patterns are excluded when `user_dat` is
# built from the directory listing.
aggregated_filetypes <- c("blamematrix", "catalog", "mimeo")
# TODO: only load last N weeks of data to keep RAM usage reasonably low
# Index the per-user report files found in the sub-directories of
# `input_dir`: one row per file, with the dot-delimited pieces of the file
# path split into the columns date / path / username / file / ext.
user_dat <- tibble(filename = list.dirs(input_dir) %>%
  Filter(function(x) {
    x != input_dir # list.dirs() also returns the top-level directory itself
  }, .) %>%
  lapply(function(x) {
    list.files(x, full.names = TRUE)
  }) %>%
  unlist()) %>%
  # drop the aggregated report types; only per-user files are kept
  filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>%
  separate_wider_delim(filename,
    delim = ".", cols_remove = FALSE,
    names = c("date", "path", "username", "file", "ext"),
    # "align_start" fills missing trailing pieces with NA. It replaces the
    # former "debug" setting, which is a debugging aid: it warned on every run
    # and added filename_ok / filename_pieces / filename_remainder columns.
    # Names with too few pieces end up with `ext = NA` and are removed by the
    # `ext` filter below.
    too_few = "align_start"
  ) %>%
  filter(
    str_detect(ext, "tsv|txt"), # only keep tab-delimited files
    !str_detect(username, "[0-9]"), # filter out numeric usernames
    username != "allusers" # filter out the 'allusers' rows
  ) %>%
  # the first piece is the path up to the first dot; its basename is the date
  mutate(date = as_date(basename(date)))
# Distinct report dates found in the file index, and the newest of them.
dates <- unique(pull(user_dat, date))
most_recent_date <- max(dates)
# Distinct usernames found in the file index.
usernames <- unique(pull(user_dat, username))
users_filter <- c("sovacoolkl", "kopardevn") # TODO optionally select certain users
# Save the file index, stamped with today's date.
write_tsv(
  user_dat,
  here("results", glue("user-dat_{today()}.tsv"))
)
# Disk usage in /data/CCBR on Biowulf ----
# Read every "summary" file of the most recent date, tag each row with the
# file it came from, split that file name into its dot-delimited pieces, and
# keep only the rows for /data/CCBR.
summary_dat_recent <- user_dat %>%
  filter( # username %in% users_filter,
    file == "summary" & date == most_recent_date
  ) %>%
  pull(filename) %>%
  map(function(tsv_file) {
    mutate(read_tsv(tsv_file), filename = tsv_file)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    names = c("basepath", "path", "username", "file", "ext"),
    delim = ".",
    cols_remove = FALSE
  ) %>%
  filter(FolderPath == "/data/CCBR")
# Save the combined table, stamped with today's date.
write_tsv(
  summary_dat_recent,
  here("results", glue("summary-dat-recent_{today()}.tsv"))
)
# Names of the numeric columns of the recent summary table, in the order they
# first appear after reshaping to long format.
summary_metrics <- summary_dat_recent %>%
  pivot_longer(where(is.numeric), names_to = "metric") %>%
  distinct(metric) %>%
  pull(metric)
# Users that rank among the 10 highest rows of at least one metric. Metrics
# whose name contains "score"/"Score" are negated first, so for those the
# smallest values rank highest.
top_users <- summary_dat_recent %>%
  pivot_longer(all_of(summary_metrics), names_to = "metric") %>%
  mutate(
    value_adj = if_else(str_detect(metric, "[sS]core"), -value, value)
  ) %>%
  group_by(metric) %>%
  slice_max(order_by = value_adj, n = 10) %>%
  distinct(username) %>%
  pull(username) %>%
  unique()
# One navigation panel per summary metric: an interactive horizontal bar chart
# of the top users for the most recent date, with bars ordered by the metric.
plots <- summary_metrics %>% lapply(function(y_metric) {
  # "score"/"Score" metrics are negated for ordering -- the same adjustment
  # that is applied when `top_users` is ranked.
  sort_sign <- if (str_detect(y_metric, "[sS]core")) -1 else 1
  top_dat <- summary_dat_recent %>%
    filter(username %in% top_users)
  # Sort directly on the metric column instead of pivoting every metric to
  # long format and filtering back down to one.
  user_order <- top_dat %>%
    arrange(sort_sign * .data[[y_metric]]) %>%
    pull(username) %>%
    unique() # factor() errors on duplicated levels
  p <- top_dat %>%
    mutate(username = factor(username, levels = user_order)) %>%
    ggplot(aes(
      # `.data[[name]]` looks up a column by its string name
      x = .data[[y_metric]],
      y = username,
      fill = .data[[y_metric]],
      # `text` is not drawn by ggplot2; it is the tooltip shown by ggplotly()
      text = glue("{username}\n{y_metric}\n{FolderPath}")
    )) +
    geom_col() +
    labs(x = y_metric, y = "") +
    theme(legend.position = "none")
  nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
})
# Combine the panels into a single pill-list navigation container.
do.call(navset_pill_list, plots)
# Read the "summary" file of every user for every date, tag each row with its
# source file, split that file name into its dot-delimited pieces, and derive
# the report date from the last path component of the first piece.
summary_dat_all <- user_dat %>%
  filter( # username %in% users_filter,
    file == "summary"
  ) %>%
  pull(filename) %>%
  map(function(tsv_file) {
    mutate(read_tsv(tsv_file), filename = tsv_file)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    names = c("basepath", "path", "username", "file", "ext"),
    delim = ".",
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(str_replace(basepath, ".*/", ""))) %>%
  filter(FolderPath == "/data/CCBR") # TODO: repeat for /data/CCBR_Pipeliner
# Save the combined table, stamped with today's date.
write_tsv(
  summary_dat_all,
  here("results", glue("summary-dat-all_{today()}.tsv"))
)
# Top 10 users per metric across all dates. Metrics whose name contains
# "score"/"Score" are negated first, so for those the smallest values rank
# highest.
top_users <- summary_dat_all %>%
  pivot_longer(all_of(summary_metrics),
    names_to = "metric"
  ) %>%
  mutate(value_adj = case_when(
    str_detect(metric, "[sS]core") ~ -value,
    TRUE ~ value
  )) %>%
  # This table has one row per user per date. Reduce to each user's single
  # best row per metric first; otherwise one user with many dates can occupy
  # all 10 slots and the result holds far fewer than 10 users.
  group_by(metric, username) %>%
  slice_max(order_by = value_adj, n = 1, with_ties = FALSE) %>%
  group_by(metric) %>%
  slice_max(order_by = value_adj, n = 10) %>%
  pull(username) %>%
  unique()
# One navigation panel per summary metric: an interactive time series of the
# metric for each top user, over all dates.
plots <- summary_metrics %>% lapply(function(y_metric) {
  p <- summary_dat_all %>%
    # every user in `top_users` comes from this table, so filter on it directly
    filter(username %in% top_users) %>%
    ggplot(aes(
      x = date,
      # `.data[[name]]` looks up a column by its string name
      y = .data[[y_metric]],
      color = username,
      # Explicit grouping is required: the per-row `text` aesthetic is a
      # discrete variable, so the default grouping (interaction of all discrete
      # variables) would put every point in its own group and geom_line()
      # would draw no lines.
      group = username,
      # `text` is not drawn by ggplot2; it is the tooltip shown by ggplotly()
      text = glue("{username}\n{y_metric}\n{FolderPath}\n{date}")
    )) +
    geom_line(alpha = 0.7) +
    geom_point() +
    labs(y = y_metric)
  nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
})
# Combine the panels into a single pill-list navigation container.
do.call(navset_pill_list, plots)